Effect of UPSTM-Based
Decorrelation on Feature Discovery
Loading the
libraries
library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
library(TH.data)
library(psych)
library(whitening)
library("vioplot")
library("rpart")
library(mlbench)
# Snapshot of the default graphics parameters so plots can restore them with par(op)
op <- par(no.readonly = TRUE)
# pander table formatting: 3 significant digits, wide tables, keep trailing zeros
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
Material and
Methods
Source W. Nick Street, Olvi L. Mangasarian and William H. Wolberg
(1995). An inductive learning approach to prognostic prediction. In A.
Prieditis and S. Russell, editors, Proceedings of the Twelfth
International Conference on Machine Learning, pages 522–530, San
Francisco, Morgan Kaufmann.
Peter Buehlmann and Torsten Hothorn (2007), Boosting algorithms:
regularization, prediction and model fitting. Statistical Science,
22(4), 477–505.
The Data
wpbc {TH.data}
data("wpbc", package = "TH.data")
sum(1*(wpbc$status=="R" & wpbc$time <= 24))
#> [1] 29
wpbc <- subset(wpbc,time > 36 | (status=="R" & time <= 36))
summary(wpbc$time)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 1.00 36.75 60.50 58.79 78.75 125.00
wpbc$status <- 1*(wpbc$status=="R" & wpbc$time <= 36)
wpbc <- wpbc[complete.cases(wpbc),]
pander::pander(table(wpbc$status))
wpbc$time <- NULL
Comparing IDeA vs PCA
vs EFA
IDeA
# Iterative Decorrelation Analysis (FRESA.CAD): returns the transformed
# data set; the transformation matrix is stored in the "UPSTM" attribute
IDeAwpbc <- IDeA(wpbc)
#plot(IDeAwpbc[,colnames(IDeAwpbc)!="status"],col=wpbc$status,cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)
#pander::pander(attr(IDeAwpbc,"UPSTM"))
# Linear formulas of the latent ("La_") features in terms of the originals
pander::pander(getLatentCoefficients(IDeAwpbc))
La_mean_perimeter:
La_mean_area:
La_mean_concavity:
La_mean_concavepoints:
La_mean_fractaldim:
La_SE_perimeter:
La_SE_area:
La_SE_concavity:
La_SE_fractaldim:
La_worst_radius:
La_worst_texture:
La_worst_perimeter:
La_worst_area:
La_worst_compactness:
# Correlation matrix of the IDeA-transformed features (outcome excluded)
IDeACor <- cor(IDeAwpbc[,colnames(IDeAwpbc)!="status"])
gplots::heatmap.2(abs(IDeACor),
                  trace = "none",
                  # scale = "row",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  main = "IDeA Correlation",
                  cexRow = 0.5,
                  cexCol = 0.5,
                  srtCol=45,
                  srtRow= -45,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")

# Locate the most correlated variable pair left after decorrelation.
# Zero the diagonal to ignore trivial self-correlation, then take the
# per-column maximum of |r| — using abs() (as the heatmap above does)
# so a strong NEGATIVE correlation is not missed; the original signed
# max() would only ever find positive associations.
diag(IDeACor) <- 0
getMaxcor <- apply(abs(IDeACor),2,max)
topCorrelated <- getMaxcor[which.max(getMaxcor)]
whotoMax <- getMaxcor[getMaxcor == topCorrelated]
plot(IDeAwpbc[,names(whotoMax)],main="IDeA: Top Correlated variables")

plot(IDeAwpbc[,c("mean_radius","La_mean_perimeter")],main="IDeA: Top Raw Correlated variables")

PCA
# PCA: project the raw features (outcome excluded) onto their principal
# components and inspect the residual correlation between the scores.
featuresnames <- colnames(wpbc)[colnames(wpbc) != "status"]
# Centered PCA; components whose sdev falls below tol are dropped
pc <- prcomp(wpbc[, featuresnames], center = TRUE, tol = 0.01)
pcScores <- predict(pc, wpbc[, featuresnames])
PCAwpbc <- as.data.frame(cbind(pcScores, status = wpbc$status))
#plot(PCAwpbc[,colnames(PCAwpbc)!="status"],col=wpbc$status,cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)
#pander::pander(pc$rotation)
# Pairwise Pearson correlation among the PCA scores
PCACor <- cor(PCAwpbc[, colnames(PCAwpbc) != "status"])
gplots::heatmap.2(abs(PCACor),
                  main = "PCA Correlation",
                  trace = "none",
                  mar = c(5, 5),
                  col = rev(heat.colors(5)),
                  cexRow = 0.5,
                  cexCol = 0.5,
                  srtCol = 45,
                  srtRow = -45,
                  key.title = NA,
                  key.xlab = "Pearson Correlation",
                  xlab = "Feature", ylab = "Feature")

EFA
# EFA: varimax-rotated exploratory factor analysis with the same number
# of factors as principal components retained by prcomp above.
featuresnames <- colnames(wpbc)[colnames(wpbc) != "status"]
# PCAwpbc also carries the "status" column, so exclude it when counting
# components; the original length(colnames(PCAwpbc)) over-counted the
# number of factors by one.
nFactors <- sum(colnames(PCAwpbc) != "status")
uls <- fa(wpbc[,featuresnames],nFactors,rotate="varimax",warnings=FALSE) # EFA analysis
EFAwpbc <- as.data.frame(cbind(predict(uls,wpbc[,featuresnames]),status=wpbc$status))
#plot(EFAwpbc[,colnames(EFAwpbc)!="status"],col=wpbc$status,cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)
#pander::pander(uls$weights)
# Residual correlation among the estimated factor scores
EFACor <- cor(EFAwpbc[,colnames(EFAwpbc)!="status"])
gplots::heatmap.2(abs(EFACor),
                  trace = "none",
                  # scale = "row",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  main = "EFA Correlation",
                  cexRow = 0.5,
                  cexCol = 0.5,
                  srtCol=45,
                  srtRow= -45,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")

Effect on CART
modeling
par(op)
par(xpd = TRUE)

# Fit a depth-limited CART model on one (possibly transformed) data set,
# plot the tree, and report the confusion table and resubstitution accuracy.
# Returns the fitted rpart model invisibly. Extracted to remove the
# four-fold copy/paste of the raw/IDeA/PCA/EFA comparisons.
reportCART <- function(dset, title)
{
  model <- rpart(status~.,dset,control=rpart.control(maxdepth=2))
  plot(model,main=title,branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(model, use.n = TRUE,cex=0.75)
  pr <- predict(model,dset,type = "class")
  pander::pander(table(dset$status,pr))
  pander::pander(c(accuracy=sum(dset$status==pr)/nrow(dset)))
  invisible(model)
}

# rpart needs a factor outcome for classification trees
wpbc$status <- factor(wpbc$status)
rawmodel <- reportCART(wpbc,"Raw")

IDeAwpbc$status <- factor(IDeAwpbc$status)
IDeAmodel <- reportCART(IDeAwpbc,"IDeA")

PCAwpbc$status <- factor(PCAwpbc$status)
PCAmodel <- reportCART(PCAwpbc,"PCA")

EFAwpbc$status <- factor(EFAwpbc$status)
EFAmodel <- reportCART(EFAwpbc,"EFA")
par(op)
Effect on logistic
modeling
# Recode the outcome back to numeric 0/1 for the binomial models,
# and propagate it to every transformed data set
wpbc$status <- 1*(wpbc$status == 1)
IDeAwpbc$status <- wpbc$status
PCAwpbc$status <- wpbc$status
EFAwpbc$status <- wpbc$status
# LASSO-regularized logistic regression (lambda at minimum CV error) on raw features
rawmodel <- LASSO_MIN(status~.,wpbc,family="binomial")
pander::pander(rawmodel$coef)
| -1.98 |
0.000569 |
-15.5 |
0.00826 |
0.0247 |
# LASSO logistic model on the IDeA-decorrelated features
IDeAmodel <- LASSO_MIN(status~.,IDeAwpbc,family="binomial")
pander::pander(IDeAmodel$coef)
| -0.437 |
0.129 |
0.276 |
2.41 |
-38.5 |
0.095 |
-133 |
0.0169 |
0.0208 |
# LASSO logistic models on the PCA and EFA representations
PCAmodel <- LASSO_MIN(status~.,PCAwpbc,family="binomial")
pander::pander(PCAmodel$coef)
EFAmodel <- LASSO_MIN(status~.,EFAwpbc,family="binomial")
pander::pander(EFAmodel$coef)
# Restore default graphics parameters
par(op)
Standardize the
names for the reporting
studyName <- "Wisconsin"
dataframe <- wpbc
outcome <- "status"
thro <- 0.80
TopVariables <- 10
cexheat = 0.25
Generating the
report
Libraries
Some libraries
library(psych)
library(whitening)
library("vioplot")
Data specs
# Data set size (feature count excludes the outcome column)
pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
# Outcome class distribution
pander::pander(table(dataframe[,outcome]))
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
# Skip the expensive heat maps when there are too many features
largeSet <- length(varlist) > 1500
Scaling the
data
Scaling and removing near zero variance columns and highly
co-linear(r>0.99999) columns
### Some global cleaning
# Keep only columns with non-zero standard deviation.
# vapply iterates over the data.frame columns directly, avoiding the
# matrix coercion apply() would perform.
# (NOTE: TRUE means the column is KEPT, despite the variable name.)
sdiszero <- vapply(dataframe, sd, numeric(1)) > 1.0e-16
dataframe <- dataframe[,sdiszero]
varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
# Drop near-duplicate features (|r| > 0.99999); always keep the outcome
tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
dataframe <- dataframe[,tokeep]
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
# Rank-based ("OrderLogit") scaling of the data
dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData
The heatmap of the
data
# Heat map of the scaled data (skipped for very large feature sets);
# rows (samples) are clustered and annotated by the outcome
if (!largeSet)
{
hm <- heatMaps(data=dataframe,
Outcome=outcome,
Scale=TRUE,
hCluster = "row",
xlab="Feature",
ylab="Sample",
srtCol=45,
srtRow=45,
cexCol=cexheat,
cexRow=cexheat
)
par(op)
}

Correlation
Matrix of the Data
The heat map of the data
if (!largeSet)
{
  par(cex = 0.6, cex.main = 0.85, cex.axis = 0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  # Pearson correlation of the scaled raw features; NAs treated as zero
  cormat <- cor(dataframe[, varlist], method = "pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    main = "Original Correlation",
                    trace = "none",
                    # scale = "row",
                    mar = c(5, 5),
                    col = rev(heat.colors(5)),
                    cexRow = cexheat,
                    cexCol = cexheat,
                    srtCol = 45,
                    srtRow = 45,
                    key.title = NA,
                    key.xlab = "Pearson Correlation",
                    xlab = "Feature", ylab = "Feature")
  # Report the largest off-diagonal |r| in the raw data
  diag(cormat) <- 0
  print(max(abs(cormat)))
}
[1]
0.9961165
The
decorrelation
# IDeA decorrelation of the scaled data; verbose=TRUE prints the
# per-iteration progress shown below
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#>
#> Included: 32 , Uni p: 0.02425652 , Uncorrelated Base: 8 , Outcome-Driven Size: 0 , Base Size: 8
#>
#>
1 <R=0.996,r=0.973,N= 6>, Top: 2( 2 )[ 1 : 2 Fa= 2 : 0.973 ]( 2 , 4 , 0 ),<|>Tot Used: 6 , Added: 4 , Zero Std: 0 , Max Cor: 0.968
#>
2 <R=0.968,r=0.959,N= 6>, Top: 1( 1 )[ 1 : 1 Fa= 3 : 0.959 ]( 1 , 1 , 2 ),<|>Tot Used: 8 , Added: 1 , Zero Std: 0 , Max Cor: 0.928
#>
3 <R=0.928,r=0.914,N= 4>, Top: 2( 1 )[ 1 : 2 Fa= 3 : 0.914 ]( 2 , 2 , 3 ),<|>Tot Used: 9 , Added: 2 , Zero Std: 0 , Max Cor: 0.893
#>
4 <R=0.893,r=0.846,N= 10>, Top: 5( 1 )[ 1 : 5 Fa= 8 : 0.846 ]( 5 , 5 , 3 ),<|>Tot Used: 18 , Added: 5 , Zero Std: 0 , Max Cor: 0.837
#>
5 <R=0.837,r=0.819,N= 10>, Top: 3( 1 )[ 1 : 3 Fa= 8 : 0.819 ]( 3 , 3 , 8 ),<|>Tot Used: 20 , Added: 3 , Zero Std: 0 , Max Cor: 0.783
#>
6 <R=0.783,r=0.800,N= 10>
#>
[ 6 ], 0.7809466 Decor Dimension: 20 Nused: 20 . Cor to Base: 11 , ABase: 2 , Outcome Base: 0
#>
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]
# Total variance before vs. after decorrelation
pander::pander(sum(apply(dataframe[,varlist],2,var)))
32.9
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))
21.8
# Shannon entropy (256-bin discretization) before vs. after decorrelation
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))
4.95
pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))
4.66
The decorrelation
matrix
if (!largeSet)
{
  par(cex = 0.6, cex.main = 0.85, cex.axis = 0.7)
  # Sparsity pattern of the decorrelation matrix returned by IDeA:
  # a cell is 1 wherever the transformation has a non-zero loading
  UPSTM <- attr(DEdataframe, "UPSTM")
  nonZeroLoadings <- 1.0 * (abs(UPSTM) > 0)
  gplots::heatmap.2(nonZeroLoadings,
                    main = "Decorrelation matrix",
                    trace = "none",
                    mar = c(5, 5),
                    col = rev(heat.colors(5)),
                    cexRow = cexheat,
                    cexCol = cexheat,
                    srtCol = 45,
                    srtRow = 45,
                    key.title = NA,
                    key.xlab = "|Beta|>0",
                    xlab = "Output Feature", ylab = "Input Feature")
  par(op)
}

The correlation
matrix after decorrelation
if (!largeSet)
{
  # Residual Pearson correlation among the decorrelated features
  cormat <- cor(DEdataframe[, varlistc], method = "pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    main = "Correlation after IDeA",
                    trace = "none",
                    mar = c(5, 5),
                    col = rev(heat.colors(5)),
                    cexRow = cexheat,
                    cexCol = cexheat,
                    srtCol = 45,
                    srtRow = 45,
                    key.title = NA,
                    key.xlab = "Pearson Correlation",
                    xlab = "Feature", ylab = "Feature")
  par(op)
  # Largest off-diagonal |r| remaining after decorrelation
  diag(cormat) <- 0
  print(max(abs(cormat)))
}
[1]
0.7827266
U-MAP Visualization
of features
The UMAP based on
LASSO on Raw Data
# Color palette keyed by class LABEL so lookups below do not depend on
# the order in which unique() happens to return the classes
classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
# 2-D UMAP embedding of the scaled raw features ('<-', not '=', for assignment)
datasetframe.umap <- umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
# Look colors up by class name; the original positional index (outcome+1)
# silently assumed the classes were exactly 0 and 1, in that order
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[as.character(dataframe[,outcome])])

The decorrelated
UMAP
# 2-D UMAP embedding of the decorrelated features ('<-' for assignment)
datasetframe.umap <- umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
# Look colors up by class name instead of the fragile positional
# outcome+1 index, which assumed classes 0 and 1 in that order
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[as.character(DEdataframe[,outcome])])

Univariate
Analysis
Univariate
# Univariate AUC ranking of the raw features
univarRAW <- uniRankVar(varlist,
                        paste(outcome,"~1"),
                        outcome,
                        dataframe,
                        rankingTest="AUC")
# Univariate AUC ranking of the decorrelated features
# (removed the stray trailing comma, which passed an extra empty argument)
univarDe <- uniRankVar(varlistc,
                       paste(outcome,"~1"),
                       outcome,
                       DEdataframe,
                       rankingTest="AUC")
Final Table
# Statistics columns to show in the univariate summary tables
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")
##topfive
# Logical mask for the TopVariables highest-ranked features;
# seq_along() is the safe idiom (c(1:length(x)) breaks for empty x)
topvar <- seq_along(varlist) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
| mean_perimeter |
0.489 |
0.908 |
-0.13157 |
0.779 |
4.26e-01 |
0.697 |
| mean_area |
0.556 |
0.974 |
-0.09535 |
0.782 |
9.68e-02 |
0.696 |
| mean_radius |
0.486 |
0.904 |
-0.13511 |
0.801 |
3.28e-01 |
0.693 |
| worst_radius |
0.570 |
0.949 |
-0.06074 |
0.822 |
2.93e-01 |
0.691 |
| worst_perimeter |
0.591 |
0.977 |
-0.05457 |
0.808 |
5.49e-01 |
0.691 |
| worst_area |
0.656 |
1.077 |
0.00399 |
0.852 |
2.72e-01 |
0.683 |
| tsize |
0.591 |
0.988 |
0.08954 |
1.116 |
3.18e-03 |
0.680 |
| mean_fractaldim |
-0.252 |
0.687 |
0.22329 |
0.927 |
6.06e-01 |
0.657 |
| pnodes |
1.014 |
1.373 |
0.36684 |
1.140 |
2.35e-08 |
0.655 |
| SE_area |
0.594 |
1.033 |
0.09745 |
0.869 |
3.92e-02 |
0.653 |
# Names of the latent (decorrelated) features, prefixed "La_" (uses stringr)
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
# Top-ranked features plus the best TopVariables/2 latent features
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]
# Latent features that made it into the final table
theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]
pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
| mean_radius |
0.48605 |
0.9044 |
-0.13511 |
0.8011 |
3.28e-01 |
0.693 |
| tsize |
0.59083 |
0.9883 |
0.08954 |
1.1155 |
3.18e-03 |
0.680 |
| pnodes |
1.01412 |
1.3734 |
0.36684 |
1.1399 |
2.35e-08 |
0.655 |
| SE_area |
0.59388 |
1.0326 |
0.09745 |
0.8689 |
3.92e-02 |
0.653 |
| La_mean_fractaldim |
-0.13729 |
0.2637 |
0.06482 |
0.4889 |
7.68e-01 |
0.644 |
| La_mean_concavity |
0.14360 |
0.5879 |
-0.13548 |
0.4464 |
8.26e-01 |
0.635 |
| La_mean_concavepoints |
0.12493 |
0.4004 |
-0.03533 |
0.4642 |
7.48e-01 |
0.614 |
| La_worst_compactness |
0.20999 |
0.7221 |
-0.05690 |
0.6046 |
7.11e-01 |
0.607 |
| La_mean_perimeter |
0.00251 |
0.0337 |
-0.00841 |
0.0336 |
6.07e-02 |
0.603 |
| La_SE_fractaldim |
-0.16095 |
0.5129 |
0.06813 |
0.5573 |
3.05e-01 |
0.603 |
# Decode each reported latent feature into a signed linear formula over
# the original features, then assemble and print the final summary table.
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")
# Original features that contribute to the significant latent variables
theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))
theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)
# Mean formula length, number of latent variables, and their fraction of features
pander::pander(c(mean=mean(vapply(dc,length,numeric(1))),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
allSigvars <- names(dc)
# (removed the dead 'dx <- names(deFromula)[1]' seed assignment — the for
# loop overwrites dx immediately; also dropped the redundant cname vector,
# since cname[cf] is always just cf)
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  for (cf in names(coef))
  {
    # Skip the latent variable's own (leading) coefficient
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        # Explicit "+ " prefix for positive terms; negatives carry their sign
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cf))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cf))
      }
    }
  }
}
# Append the raw features feeding the latent variables that are not yet listed
finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])
# Match each row back to its raw feature name to fetch the raw AUC
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")
# Sort by decorrelated AUC, best first
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
| mean_perimeter |
NA |
0.48901 |
0.9082 |
-0.13157 |
0.7788 |
4.26e-01 |
0.697 |
0.697 |
NA |
| mean_radius |
|
0.48605 |
0.9044 |
-0.13511 |
0.8011 |
3.28e-01 |
0.693 |
0.693 |
3 |
| tsize |
|
0.59083 |
0.9883 |
0.08954 |
1.1155 |
3.18e-03 |
0.680 |
0.680 |
NA |
| mean_fractaldim |
NA |
-0.25171 |
0.6868 |
0.22329 |
0.9273 |
6.06e-01 |
0.657 |
0.657 |
NA |
| pnodes |
|
1.01412 |
1.3734 |
0.36684 |
1.1399 |
2.35e-08 |
0.655 |
0.655 |
NA |
| SE_area |
|
0.59388 |
1.0326 |
0.09745 |
0.8689 |
3.92e-02 |
0.653 |
0.653 |
2 |
| La_mean_fractaldim |
+ 1.000mean_fractaldim
-0.759worst_fractaldim |
-0.13729 |
0.2637 |
0.06482 |
0.4889 |
7.68e-01 |
0.644 |
0.657 |
-1 |
| La_mean_concavity |
-0.706mean_compactness +
1.000mean_concavity |
0.14360 |
0.5879 |
-0.13548 |
0.4464 |
8.26e-01 |
0.635 |
0.595 |
0 |
| mean_concavepoints |
NA |
0.38302 |
1.0586 |
-0.06750 |
0.9633 |
2.98e-01 |
0.619 |
0.619 |
NA |
| La_mean_concavepoints |
-0.992mean_concavity +
1.000mean_concavepoints |
0.12493 |
0.4004 |
-0.03533 |
0.4642 |
7.48e-01 |
0.614 |
0.619 |
-1 |
| La_worst_compactness |
+ 1.000worst_compactness
-0.968worst_fractaldim |
0.20999 |
0.7221 |
-0.05690 |
0.6046 |
7.11e-01 |
0.607 |
0.516 |
-1 |
| La_mean_perimeter |
-0.980mean_radius + 1.000mean_perimeter
-0.063*mean_compactness |
0.00251 |
0.0337 |
-0.00841 |
0.0336 |
6.07e-02 |
0.603 |
0.697 |
-2 |
| La_SE_fractaldim |
-0.873SE_compactness +
1.000SE_fractaldim |
-0.16095 |
0.5129 |
0.06813 |
0.5573 |
3.05e-01 |
0.603 |
0.549 |
-1 |
| worst_fractaldim |
NA |
-0.15068 |
0.8304 |
0.20868 |
1.0616 |
3.22e-01 |
0.598 |
0.598 |
2 |
| mean_concavity |
NA |
0.26012 |
0.9174 |
-0.03242 |
0.8892 |
6.78e-01 |
0.595 |
0.595 |
NA |
| SE_fractaldim |
NA |
-0.01848 |
0.8634 |
0.23441 |
1.1722 |
4.15e-01 |
0.549 |
0.549 |
NA |
| SE_compactness |
NA |
0.16325 |
1.1390 |
0.19053 |
1.0887 |
2.15e-01 |
0.525 |
0.525 |
2 |
| worst_compactness |
NA |
0.06410 |
1.0995 |
0.14515 |
1.2064 |
3.09e-01 |
0.516 |
0.516 |
NA |
| mean_compactness |
NA |
0.16493 |
0.9543 |
0.14588 |
1.1026 |
5.38e-01 |
0.509 |
0.509 |
2 |